# "Targeting Coethnic Voters, Elites or Both?"
# Matched data set
# Dongil Lee
# 10/4/2024

# Part I. Setup -----

# Remove every object from the global environment before running.
# NOTE(review): this only clears objects (attached packages and options stay),
# and it wipes the workspace of anyone who source()s this file -- consider
# running the script in a fresh R session instead.
rm(list=ls())

# Packages used below: dplyr/tidyverse for %>% and filter(), MatchIt for
# matchit(), kableExtra for kbl()/kable_minimal(), haven for read_dta() and
# write_dta().
library(dplyr, warn.conflicts = FALSE)
library(tidyverse)
library(stringr)
library(MatchIt)
library(kableExtra)
library(haven)

# Every relative path built later in the script ("combined_data/...") is
# resolved against this working directory.
# NOTE(review): machine-specific absolute path; the script will not run on
# another machine without editing this line.
setwd("/Users/dlee/Dropbox/Work/NYU/2016 Summer/MA_thesis")
cnt <- "/" # cnt denotes connector: the separator passed to paste(sep = cnt)

# Part II: Build the regime-specific data sets ----

# Read the combined Stata file and turn the tibble into a plain data frame.
file_path <- file.path("combined_data", "data_with_MP_add.dta")
df <- as.data.frame(haven::read_dta(file_path))

# Store the year and the constituency code as integers.
df[["year"]] <- as.integer(df[["year"]])
df[["const_code"]] <- as.integer(df[["const_code"]])

# For each regime k = 1, 2, 3:
#   dfk    keeps the rows where both MPcoethnic_regimek and infant are
#          non-missing;
#   dfk_cs is the single-year slice of dfk (1999, 2004, 2009 respectively)
#          that is passed to matchit() further down.
df1 <- filter(df, !is.na(MPcoethnic_regime1), !is.na(infant))
df1_cs <- filter(df1, year == 1999)

df2 <- filter(df, !is.na(MPcoethnic_regime2), !is.na(infant))
df2_cs <- filter(df2, year == 2004)

df3 <- filter(df, !is.na(MPcoethnic_regime3), !is.na(infant))
df3_cs <- filter(df3, year == 2009)

###################################
# Nearest Neighbor Matching (NNM) #
# Mahalanobis distance            #
###################################

# One matchit() call per regime, each on that regime's single-year
# cross-section. method = "nearest" with distance = "mahalanobis" requests
# nearest-neighbour matching on the Mahalanobis distance computed from the
# three covariates on the right-hand side; all other matchit() arguments are
# left at their defaults.
# NOTE(review): the treatment variables are MPcoethnic1/2/3, whereas the data
# frames were filtered on MPcoethnic_regime1/2/3 -- confirm that these are
# meant to be different columns.

nnm_m1_out <- matchit(MPcoethnic1 ~ logpop + infant + coethnic_c, # covariates matched on
                      data = df1_cs,
                      method = "nearest",
                      distance = "mahalanobis")

nnm_m2_out <- matchit(MPcoethnic2 ~ logpop + infant + coethnic_c,
                      data = df2_cs,
                      method = "nearest",
                      distance = "mahalanobis")

nnm_m3_out <- matchit(MPcoethnic3 ~ logpop + infant+ coethnic_c,
                      data = df3_cs,
                      method = "nearest",
                      distance = "mahalanobis")

# Summaries of the three matchit fits (reused for the tables below).
sum_nnm_m1_out <- summary(nnm_m1_out)
sum_nnm_m2_out <- summary(nnm_m2_out)
sum_nnm_m3_out <- summary(nnm_m3_out)

# Display labels for the three rows of each matched-sample balance table.
# They are listed in the order of the covariates in the matching formulas
# (logpop, infant, coethnic_c).
cov_labels <- c("Population", "Infant Mortality", "Coethnic Voter Prop.")

# Matched-sample balance statistics, relabelled for presentation.
cov_imbal_1 <- sum_nnm_m1_out$sum.matched
rownames(cov_imbal_1) <- cov_labels

cov_imbal_2 <- sum_nnm_m2_out$sum.matched
rownames(cov_imbal_2) <- cov_labels

cov_imbal_3 <- sum_nnm_m3_out$sum.matched
rownames(cov_imbal_3) <- cov_labels

# Print the sample-size tables of the three fits to the console.
sum_nnm_m1_out$nn
sum_nnm_m2_out$nn
sum_nnm_m3_out$nn

# Tables A9-11.
# Each pipeline prints the LaTeX source of one balance table to the console
# (two decimal places, booktabs rules, centred columns, not full width).
# The logical arguments are spelled TRUE/FALSE: T and F are ordinary variables
# that can be reassigned, TRUE and FALSE cannot.
cov_imbal_1 %>%
  kbl(caption = "Covariate Imbalance for the Muluzi Regime",
      format = "latex",
      digits = 2,
      booktabs = TRUE,
      align = "c") %>%
  kable_minimal(full_width = FALSE)

cov_imbal_2 %>%
  kbl(caption = "Covariate Imbalance for the Mutharika I Regime",
      format = "latex",
      digits = 2,
      booktabs = TRUE,
      align = "c") %>%
  kable_minimal(full_width = FALSE)

cov_imbal_3 %>%
  kbl(caption = "Covariate Imbalance for the Mutharika II Regime",
      format = "latex",
      digits = 2,
      booktabs = TRUE,
      align = "c") %>%
  kable_minimal(full_width = FALSE)

# Figures A6-8
# For each regime: open an 8 x 4 inch PDF device, draw the plot of the
# matching summary into it, and close the device so the file is written.
# NOTE(review): the output paths are absolute and machine-specific.
cov_bal1_pdf <- "/Users/dlee/Dropbox/Work/NYU/2016 Summer/MA_thesis/graphs2/cov_bal1.pdf"
pdf(cov_bal1_pdf, width = 8, height = 4)
plot(sum_nnm_m1_out)
dev.off()

cov_bal2_pdf <- "/Users/dlee/Dropbox/Work/NYU/2016 Summer/MA_thesis/graphs2/cov_bal2.pdf"
pdf(cov_bal2_pdf, width = 8, height = 4)
plot(sum_nnm_m2_out)
dev.off()

cov_bal3_pdf <- "/Users/dlee/Dropbox/Work/NYU/2016 Summer/MA_thesis/graphs2/cov_bal3.pdf"
pdf(cov_bal3_pdf, width = 8, height = 4)
plot(sum_nnm_m3_out)
dev.off()

# Keep the rows of `panel` that belong to units retained by the matching.
#
# fit$weights has one entry per row of the cross-section that was passed to
# matchit(), so it must not be used directly as a row index on the (possibly
# longer) regime data frame: R silently recycles a shorter logical index and
# would select arbitrary rows. Instead, the const_code of every cross-section
# row with a non-zero weight is looked up, and all rows of `panel` carrying
# one of those codes are kept.
#
# NOTE(review): assumes const_code identifies a constituency and that the
# intent is to keep every regime-period row of a matched constituency --
# confirm. When `panel` and `cross_section` contain the same rows and
# const_code is unique, the result equals the previous weights-based subset.
#
# panel          data frame to subset (must contain const_code)
# cross_section  data frame that was given to matchit() as `data`
# fit            the matchit object estimated on `cross_section`
# Returns the subset of `panel` as a data frame.
keep_matched <- function(panel, cross_section, fit) {
  stopifnot(length(fit$weights) == nrow(cross_section))
  matched_codes <- cross_section$const_code[fit$weights != 0]
  panel[panel$const_code %in% matched_codes, , drop = FALSE]
}

matched_df1 <- keep_matched(df1, df1_cs, nnm_m1_out)
matched_df2 <- keep_matched(df2, df2_cs, nnm_m2_out)
matched_df3 <- keep_matched(df3, df3_cs, nnm_m3_out)

# Output files, relative to the working directory.
file_path1 <- file.path("combined_data", "nnm1.dta")
file_path2 <- file.path("combined_data", "nnm2.dta")
file_path3 <- file.path("combined_data", "nnm3.dta")

# Write the matched data sets in Stata format.
haven::write_dta(matched_df1, file_path1)
haven::write_dta(matched_df2, file_path2)
haven::write_dta(matched_df3, file_path3)